library(tidyverse)
library(modelsummary)

# Reference tables keyed by URL and by domain (joined onto the headlines below).
url_df <- read_tsv(file = "data/raw/url_reference_table.tsv")
domain_df <- read_tsv(file = "data/raw/domain_reference_table.tsv")

# Hand codings exported from Qualtrics as SPSS; value labels are stripped so
# the columns are plain vectors.
qualtrics_codings <- haven::zap_labels(
    haven::read_spss("data/qualtrics_hand_coding_scrubbed.sav")
)

# Headlines that were sampled for hand coding.
sampled_headlines <- read_tsv(file = "data/sampled_headlines.tsv")

# If using the de-anonymized version, filter out cases with no names (these were for testing).
# qualtrics_codings <- filter(qualtrics_codings, QID5652 != "")

# Drop one respondent to improve reliability.
# NOTE(review): this is a positional index (row 8), so it depends on the row
# order of the input file and on whether the filter above has been enabled.
qualtrics_codings <- qualtrics_codings[-8, ]

# Matrix of hand codings: one row per remaining respondent, one column per
# `url_*` item. The code -99 is recoded to NA so it is treated as missing.
x <- qualtrics_codings |>
    select(starts_with("url_")) |>
    mutate(across(
        everything(),
        \(coding) case_when(coding == -99 ~ NA_real_, TRUE ~ coding)
    )) |>
    as.matrix()

# Krippendorff's alpha across respondents, using the ordinal metric.
print(irr::kripp.alpha(x, method = "ordinal"))

# Re-order variables: reverse the order of the ten consecutive batches of 100
# columns (901:1000 first, 1:100 last) while keeping the order within a batch.
# NOTE(review): presumably this aligns the columns of `x` with the row order
# of `sampled_headlines` -- confirm against the survey export.
batch_size <- 100L
batch_order <- unlist(lapply(
    10:1,
    \(batch) (batch - 1L) * batch_size + seq_len(batch_size)
))
x <- x[, batch_order]

# Mean coding per item (column) across respondents, ignoring missing codings.
sampled_headlines$hand_coding <- colMeans(x, na.rm = TRUE)

# Correlation between the averaged hand coding and the URL score.
print(cor(
    sampled_headlines$hand_coding,
    sampled_headlines$url_score,
    use = "pairwise.complete.obs"
))

# Attach the continuous URL-level and domain-level scores to each headline.
# Both joins are inner joins, so headlines without a match in either
# reference table are dropped.
sampled_headlines <- sampled_headlines |>
    inner_join(
        select(url_df, url_score_continuous, url),
        by = "url"
    ) |>
    inner_join(
        select(
            domain_df,
            domain_score_continuous = domain_score_continuous_political,
            domain
        ),
        by = "domain"
    )


# Linear models of the averaged hand coding:
#   m1 -- domain score and URL score together (full model)
#   m2 -- URL score only
#   m3 -- domain score only
m1 <- lm(
    hand_coding ~ domain_score_continuous + url_score_continuous,
    data = sampled_headlines
)
m2 <- lm(hand_coding ~ url_score_continuous, data = sampled_headlines)
m3 <- lm(hand_coding ~ domain_score_continuous, data = sampled_headlines)

# Full model vs. domain-only model: does the URL score add explanatory power?
alt_ftest <- anova(m1, m3)
print(alt_ftest)

# Full model vs. URL-only model: does the domain score add explanatory power?
ftest <- anova(m1, m2)
print(ftest)

# Information-criterion comparison of all three models.
print(AIC(m1, m2, m3))


# Inline display size for notebook front ends that read the `repr.*` options;
# the saved files use the width/height passed to ggsave() below.
options(repr.plot.width = 6, repr.plot.height = 4, repr.plot.res = 450)

# Fixed seed so the jittered point positions, and therefore the saved figure,
# are identical on every run.
jitter_seed <- 1

# Scatter of URL score against the averaged hand coding, coloured by domain
# score, with the identity line (dashed) and a linear fit (orange).
p2 <- ggplot(
    sampled_headlines,
    aes(x = hand_coding, y = url_score_continuous, color = domain_score_continuous)
) +
    geom_point(alpha = 0.9, position = position_jitter(seed = jitter_seed)) +
    geom_abline(
        intercept = 0, slope = 1,
        linetype = "dashed", color = "black", alpha = 0.8
    ) +
    # The explicit formula matches the default for method = "lm" and avoids
    # the informational message ggplot2 prints when it is omitted.
    geom_smooth(
        method = "lm", formula = y ~ x,
        linetype = "solid", color = "orange", alpha = 0.8, se = FALSE
    ) +
    theme_classic() +
    theme(
        text = element_text(family = "serif", size = 16),
        strip.text = element_text(face = "bold"),
        plot.title = element_text(face = "bold", size = 20)
    ) +
    # Zoom to [-1, 1] on both axes without dropping data from the fit.
    coord_cartesian(xlim = c(-1, 1), ylim = c(-1, 1)) +
    labs(
        y = "URL Score (Twitter)",
        x = "Evaluated Partisan Appeal",
        color = "Domain Score\n(Twitter)"
    ) +
    scale_color_gradient2(
        midpoint = 0, limits = c(-1, 1),
        high = "firebrick", mid = "gray50", low = "dodgerblue"
    )

# The same figure is written under both file names.
ggsave("results/fig_6.pdf", p2, width = 12, height = 8, dpi = 450)
ggsave("results/hand_coding.pdf", p2, width = 12, height = 8, dpi = 450)

# Regression table for the three models. This call is kept at the very end
# because it often signals an error even though the output file is produced.
# The error is downgraded to a warning only when the file was actually
# (re)written during this call; otherwise it is re-raised unchanged.
table_path <- "results/table_e1.tex"
mtime_before <- file.mtime(table_path) # NA when the file does not exist yet

tryCatch(
    modelsummary(list("Full" = m1, "URL only" = m2, "Domain only" = m3),
                 output = table_path,
                 coef_rename = c("url_score_continuous" = "URL score", "domain_score_continuous" = "Domain score")),
    error = function(err) {
        mtime_after <- file.mtime(table_path)
        refreshed <- !is.na(mtime_after) &&
            (is.na(mtime_before) || mtime_after > mtime_before)
        if (!refreshed) {
            stop(err)
        }
        warning(
            "modelsummary() signalled an error after writing ", table_path,
            ": ", conditionMessage(err),
            call. = FALSE
        )
        invisible(NULL)
    }
)
